import warnings # Ignore warning messages
import plotly.express as px # interactive charts
import pandas as pd # data manipulation
import numpy as np # linear algebra
# Silence pandas chained-assignment and library deprecation warnings notebook-wide.
warnings.filterwarnings("ignore")
# Medical insurance dataset: age, sex, bmi, children, smoker, region, charges.
df=pd.read_csv("insurance.csv")
df.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
df.info()  # column dtypes and non-null counts (1338 rows, no missing values)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
# Unique values of every categorical (object-dtype) column.
{column: list(df[column].unique()) for column in df.select_dtypes(include="object")}
{'sex': ['female', 'male'],
'smoker': ['yes', 'no'],
'region': ['southwest', 'southeast', 'northwest', 'northeast']}
def pie_plot(feature, colors):
    """Show a donut chart of the share of each category in `feature`.

    `colors` maps each category value to the CSS color of its slice.
    """
    figure = px.pie(
        df,
        names=feature,
        color=feature,
        hole=0.3,
        color_discrete_map=colors,
        title=feature + " percent",
    )
    return figure.show()
# Category shares of the three categorical features.
pie_plot("smoker", {"yes": "red", "no": "#77dd77"})
pie_plot("sex", {"male": "skyblue", "female": "pink"})
# Fix: 'northwest' and 'northeast' were missing the leading '#', so plotly
# did not recognize them as hex colors and ignored the mapping.
pie_plot("region", {'southwest': "#fdcae1",
                    'southeast': '#84b6f4',
                    'northwest': '#0096d2',
                    'northeast': "#ffffbf"})
def scater_plot(x_var, title, colors):
    """Scatter `x_var` against charges, one OLS trendline per smoker group.

    `colors` maps smoker values ("yes"/"no") to CSS colors.
    """
    figure = px.scatter(
        df,
        x=x_var,
        y="charges",
        color="smoker",
        color_discrete_map=colors,
        trendline="ols",
        title=title,
    )
    return figure.show()
# NOTE(review): "yes" maps to green and "no" to red here — the reverse of the
# pie chart palette above; confirm this inversion is intended.
scater_plot("bmi","BMI vs charges",{"yes":"#77dd77","no":"#ff6961"})
For smokers, insurance charges increase roughly linearly with BMI; for non-smokers they remain nearly constant. The plot also suggests some outliers.
# Same smoker color scheme as the BMI scatter above.
scater_plot("age","Age vs Charges",{"yes":"#77dd77","no":"#ff6961"})
For non-smokers, charges follow a clear linear trend with age.
# Charges distribution by smoking status (with marginal box plot), then
# charges broken down by smoking status and region.
smoker_palette = {"yes": "#77dd77", "no": "#ff6961"}
fig = px.histogram(
    df,
    x="charges",
    color="smoker",
    color_discrete_map=smoker_palette,
    marginal="box",
    title="Smoker vs Charges",
)
fig.show()
fig = px.box(
    df,
    x="smoker",
    y="charges",
    color="region",
    title="Smoker vs Region vs Charges",
)
fig.show()
The histogram and the box plot confirm the presence of outliers, so they need special treatment.
Technically, we can treat outliers the same way as missing values: flag them as NaN and then impute them.
# BMI distribution by sex, with a marginal box plot for outlier spotting.
fig = px.histogram(
    df,
    x="bmi",
    color="sex",
    marginal="box",
    color_discrete_map={"male": "skyblue", "female": "pink"},
    title="Sex vs BMI",
)
fig.show()
The interactive plot reveals some unusually low BMI values. It also shows that men tend to have a slightly higher BMI than women.
class lower_upper_limits():
    """Mean +/- `limit` standard deviations of one numeric column.

    A simple z-score style rule used to choose outlier cut-offs.
    """

    def __init__(self, dataset, feature, limit):
        self.dataset = dataset
        self.limit = limit
        column = dataset[feature]
        self.mean = column.mean()
        self.std = column.std()

    def upper_limit(self):
        # mean + limit * std
        return self.limit * self.std + self.mean

    def lower_limit(self):
        # mean - limit * std
        return self.mean - self.limit * self.std
# 2.5-sigma upper limit for BMI (evaluates to ~45.91, see output below).
lower_upper_limits(df,"bmi",2.5).upper_limit()
45.90886414018408
# Cap BMI at 45.90 — the 2.5-sigma upper limit computed above, truncated by hand.
df["bmi"]=np.where(df["bmi"]>45.90,45.90,df["bmi"])
# 2-sigma lower limit for BMI (~18.59); the floor applied next is again truncated.
lower_upper_limits(df,"bmi",2).lower_limit()
18.588792692344214
# Floor BMI at 18 (truncated from the ~18.59 lower limit computed above).
df["bmi"]=np.where(df["bmi"]<18,18,df["bmi"])
# Re-check the BMI distribution after the caps were applied.
fig = px.histogram(
    df,
    x="bmi",
    color="sex",
    marginal="box",
    color_discrete_map={"male": "skyblue", "female": "pink"},
    title="Sex vs BMI with feature eng ",
)
fig.show()
def split_smoker(condition):
    """Return the rows of `df` whose smoker column equals `condition`."""
    mask = df["smoker"] == condition
    return df[mask]

smoker_no_split = split_smoker("no")
smoker_yes_split = split_smoker("yes")
# 2-sigma upper cut-off for non-smoker charges (~20422, see output below).
lower_upper_limits(smoker_no_split,"charges",2).upper_limit()
20421.831936246064
# Non-smoker charges above 20000 become NaN so they can be re-imputed later.
smoker_no_split["charges"] = smoker_no_split["charges"].where(smoker_no_split["charges"] <= 20000)
# Percentage of NaN per column, relative to the full dataset size.
((smoker_no_split.isnull().sum() / len(df)) * 100).sort_values(ascending=False)
charges 4.559043 age 0.000000 sex 0.000000 bmi 0.000000 children 0.000000 smoker 0.000000 region 0.000000 dtype: float64
def scatter_age(dataframe):
    """Scatter age vs charges with a single OLS trendline."""
    figure = px.scatter(dataframe, x="age", y="charges", trendline="ols")
    return figure.show()

scatter_age(smoker_no_split)
We still see outliers, so the dataset is divided by user age to allow a finer-grained cleaning.
# Split non-smokers into three age bands for band-specific outlier cut-offs.
young_adults_split = smoker_no_split[smoker_no_split["age"] < 30]
adults_split = smoker_no_split[(smoker_no_split["age"] >= 30) & (smoker_no_split["age"] < 50)]
old_adults_split = smoker_no_split[smoker_no_split["age"] >= 50]
def outlires_to_nan(dataframe, upper_limit):
    """Return the charges column with values above `upper_limit` set to NaN."""
    charges = dataframe["charges"]
    return charges.where(charges <= upper_limit)
# Band-specific caps: anything above the cap becomes NaN, to be imputed below.
young_adults_split["charges"]=outlires_to_nan(young_adults_split,7000)
adults_split["charges"]=outlires_to_nan(adults_split,10000)
old_adults_split["charges"]=outlires_to_nan(old_adults_split,18000)
# Reassemble the non-smoker subset (rows are now grouped by age band).
smoker_no_split=pd.concat([young_adults_split,
                           adults_split,
                           old_adults_split])
scatter_age(smoker_no_split)  # re-check: values above the caps are now gone
The dataset is now free of outliers. To avoid losing data, we build a model to substitute the null values, which should come closer to the truth than replacing them with a single statistical measure.
A linear regression model is a good fit here, since charges grow linearly with age for non-smokers.
# Separate the non-smoker rows into complete rows and rows whose charges are NaN.
smoker_no_clear = smoker_no_split.dropna()  # rows with no missing values
smoker_no_nan = smoker_no_split[smoker_no_split["charges"].isnull()]  # rows to impute
# Features / target for the imputation model ("smoker" is constant in this subset).
X = smoker_no_clear.drop(columns=["charges", "smoker"])
y = smoker_no_clear["charges"]
from sklearn.model_selection import train_test_split,cross_val_score
# Hold out a third of the complete rows to evaluate the imputation model.
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=0)
X_train.shape,X_test.shape
((636, 5), (314, 5))
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer # Variables transform
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
# Scale the numeric columns to [0, 1] and one-hot encode the categoricals
# (the binary "sex" column collapses to one column via drop="if_binary").
tf_col=make_column_transformer((MinMaxScaler(),["age","bmi","children"]),
                               (OneHotEncoder(drop="if_binary"),["sex","region"]))
from sklearn.linear_model import LinearRegression
# Preprocessing + linear regression in one estimator, fit on the complete rows.
lm=Pipeline([("preprocessor",tf_col),("linear_model",LinearRegression())])
lm.fit(X_train,Y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('minmaxscaler',
MinMaxScaler(),
['age', 'bmi', 'children']),
('onehotencoder',
OneHotEncoder(drop='if_binary'),
['sex'])])),
('linear_model', LinearRegression())])
lm.score(X_train,Y_train)  # R^2 on the training split
0.9748523817175202
lm.score(X_test,Y_test)  # R^2 on the held-out split
0.9780566985877265
# NOTE(review): cross-validation is run on the test split only; it is usually
# run on the full X, y — confirm this was intentional.
cross_val_score(lm,X_test,Y_test,cv=5).mean()
0.9775607752327227
Generalization is quite high: the linear model explains about 97% of the variance, so we can use it to substitute the missing values and avoid excessive data loss.
# Predict charges for the blanked-out non-smoker rows and fill them back in.
new_data=smoker_no_nan.drop(["charges"],axis="columns")
smoker_no_nan["charges"]=lm.predict(new_data)
smoker_no_nan.head()
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 102 | 18 | female | 30.115 | 0 | no | northeast | 1102.515590 |
| 143 | 29 | male | 29.735 | 2 | no | northwest | 4445.436290 |
| 219 | 24 | female | 23.210 | 0 | no | southeast | 2723.927770 |
| 291 | 29 | male | 29.640 | 1 | no | northeast | 4000.599563 |
| 305 | 29 | male | 33.345 | 2 | no | northwest | 4432.998488 |
smoker_no_nan.isnull().sum()  # sanity check: no NaN left after imputation
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
# Merge the imputed rows back into the clean non-smoker set.
smoker_no_clear = pd.concat([smoker_no_clear, smoker_no_nan])
# Inspect the smoker charges distribution before choosing a cut-off.
fig = px.histogram(smoker_yes_split, x="charges")
fig.show()
# Smoker charges above 50000 become NaN, to be imputed like the non-smoker ones.
smoker_yes_split["charges"] = smoker_yes_split["charges"].where(smoker_yes_split["charges"] <= 50000)
smoker_yes_split.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 274 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 274 non-null int64 1 sex 274 non-null object 2 bmi 274 non-null float64 3 children 274 non-null int64 4 smoker 274 non-null object 5 region 274 non-null object 6 charges 267 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 17.1+ KB
Smokers also have outliers; convert them to missing values so they can be replaced the same way as before.
smoker_yes_clear=smoker_yes_split.dropna() # eliminate nan values
smoker_yes_nan=smoker_yes_split[smoker_yes_split["charges"].isnull()] # Select nan values
# Features / target for the smoker imputation model.
X=smoker_yes_clear.drop(["charges","smoker"],axis="columns")
y=smoker_yes_clear.charges
# Same split policy as the non-smoker model.
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.33,random_state=0)
X_train.shape,X_test.shape
((178, 5), (89, 5))
from sklearn.ensemble import GradientBoostingRegressor
# Shallow, slow-learning boosted trees for the noisier smoker charges.
gbr_reg=GradientBoostingRegressor(max_depth=2,
                                  learning_rate=0.01,
                                  n_estimators=280,random_state=42)
# NOTE(review): this reuses the same tf_col instance fitted inside `lm`;
# fitting this pipeline refits that transformer, so `lm` must not be
# reused afterwards — confirm that is acceptable.
gbr_reg=Pipeline([("preprocesor",tf_col),("model",gbr_reg)])
gbr_reg.fit(X_train,Y_train)
Pipeline(steps=[('preprocesor',
ColumnTransformer(transformers=[('minmaxscaler',
MinMaxScaler(),
['age', 'bmi', 'children']),
('onehotencoder',
OneHotEncoder(drop='if_binary'),
['sex', 'region'])])),
('model',
GradientBoostingRegressor(learning_rate=0.01, max_depth=2,
n_estimators=280,
random_state=42))])
gbr_reg.score(X_train,Y_train)  # R^2 on the training split
0.9357831302642619
gbr_reg.score(X_test,Y_test)  # R^2 on the held-out split
0.9042245238093337
# Impute the smoker outliers and rebuild the full cleaned dataset.
new_data=smoker_yes_nan.drop(["charges"],axis="columns")
smoker_yes_nan["charges"]=gbr_reg.predict(new_data)
smoker_yes_clear=pd.concat([smoker_yes_clear,smoker_yes_nan])
df_clear=pd.concat([smoker_no_clear,smoker_yes_clear])
# Final look at charges by smoking status after all the outlier handling.
fig = px.box(
    df_clear,
    x="smoker",
    y="charges",
    color="smoker",
    color_discrete_map={"yes": "#77dd77", "no": "#ff6961"},
    title="Feature engineering",
)
fig.show()
df_clear.info()  # confirm 1338 rows remain and no column has nulls
<class 'pandas.core.frame.DataFrame'> Int64Index: 1338 entries, 1 to 1300 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 83.6+ KB
# NOTE(review): filename is misspelled ("insurence") and the original row index
# is written as an extra column — confirm downstream consumers expect both.
df_clear.to_csv("insurence-clear.csv")